import datetime
import re, json
from urllib import parse
from PIL import Image
import scrapy
from scrapy import cmdline
from items import QuestionItem

__author__ = 'hexun'

if __name__ == "__main__":
    # Convenience entry point: launch this spider through the Scrapy CLI,
    # equivalent to running `scrapy crawl zhihu` from the project root.
    cmdline.execute(["scrapy", "crawl", "zhihu"])


class ZhihuSpider(scrapy.Spider):
    """Spider that logs into zhihu.com, walks the home feed, and scrapes
    each question page into a :class:`QuestionItem`.

    Flow: start_requests -> login (fetch _xsrf, POST credentials)
          -> check_login -> parse (feed) -> parse_detail (question page).
    """

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = ["https://www.zhihu.com/"]

    # Browser-like headers; zhihu rejects requests without a real User-Agent.
    headers = {
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }

    def start_requests(self):
        """Begin at the sign-in page so the session is authenticated first."""
        return [scrapy.Request(
            url="https://www.zhihu.com/#signin",
            headers=self.headers,
            meta={'cookiejar': 1},  # keep the login cookies in a dedicated jar
            callback=self.login,
        )]

    def parse(self, response):
        """Extract question links from the logged-in home feed and follow them."""
        ques_nodes = response.css("div.TopstoryMain .TopstoryItem")
        for node in ques_nodes:
            # BUGFIX: the second selector was 'ContentItem-title' (no leading
            # dot), which matched a non-existent tag name instead of the CSS
            # class, so ques_url/title were always ''.
            title_node = node.css(".ContentItem .ContentItem-title")
            ques_url = title_node.css("div a::attr(href)").extract_first('')
            title = title_node.css("div a::text").extract_first('')
            print(ques_url, title)
            # hrefs on the feed are relative; resolve against the page URL.
            yield scrapy.Request(url=parse.urljoin(response.url, ques_url),
                                 meta={}, callback=self.parse_detail)

    def parse_detail(self, response):
        """Scrape one question page into a QuestionItem and yield it."""
        item = QuestionItem()
        main_node = response.css("div.QuestionHeader .QuestionHeader-main")
        status_node = response.css("div.QuestionHeader-side .QuestionFollowStatus .QuestionFollowStatus-counts")
        # BUGFIX: the extracted values were previously bound to locals and
        # discarded, so an empty item was yielded. Store them on the item.
        # NOTE(review): field names assumed to match the QuestionItem
        # declaration in items.py — confirm against that file.
        item["title"] = main_node.css("h1::text").extract_first("")
        item["url"] = response.url
        item["content"] = main_node.css(".QuestionHeader-detail .QuestionRichText span::text").extract_first("")
        item["watch_user_num"] = status_node.css("button.NumberBoard-item .NumberBoard-value::text").extract_first("")
        item["click_num"] = status_node.css("div.NumberBoard-item .NumberBoard-value::text").extract_first("")
        item["comments_num"] = response.css("div.QuestionHeader-footer .QuestionHeader-Comment button::text").extract_first("")
        item["crawl_time"] = datetime.datetime.now()
        print(item["content"])
        yield item

    def login(self, response):
        """Scrape the _xsrf token from the sign-in page and POST the login form."""
        # 1. Extract the CSRF token embedded in the page.
        re_match = re.match(r'.*name="_xsrf" value="(.*?)"', response.text, re.S)
        xsrf = re_match.group(1) if re_match else ''
        # Without a token the POST would be rejected; silently stop instead.
        if xsrf:
            # SECURITY: credentials are hardcoded in source — move them to
            # spider settings / environment variables before sharing this code.
            post_data = {
                "_xsrf": xsrf,
                "account": "gaohui7141@163.com",
                "password": "gaohui4850074",
                "captcha": "",
                "captcha_type": "cn"
            }
            import time

            # Cache-busting timestamp (milliseconds) for the captcha URL.
            r = str(int(time.time() * 1000))
            captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login&lang=cn".format(r)

            # 2. Optional captcha round-trip (disabled; see check_captcha).
            #yield scrapy.Request(captcha_url, meta={"post_data": post_data, "cookiejar":response.meta.get('cookiejar')}, headers=self.headers,
            #                     callback=self.check_captcha)

            post_url = "https://www.zhihu.com/login/email"
            # 3. Submit the login form, reusing the same cookie jar.
            return [scrapy.FormRequest(
                url=post_url,
                formdata=post_data,
                headers=self.headers,
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.check_login,
                dont_filter=True
            )]

    # 2. Fetch the captcha image, show it to the operator, then log in.
    def check_captcha(self, response):
        """Save the captcha image and submit the login form (captcha entry disabled)."""
        # BUGFIX: removed the redundant f.close() — the `with` block already
        # closes the file on exit.
        with open("captcha.gif", 'wb') as f:
            f.write(response.body)
        '''
        try:
            im = Image.open("captcha.gif")
            im.show()
        except:
            print("no captcha")
            pass

        captcha = input('请输入验证码：')
        '''
        post_data = response.meta.get("post_data", {})
        # post_data["captcha"] = captcha
        post_url = "https://www.zhihu.com/login/email"
        # 3. Submit the login form with the same cookie jar as the token fetch.
        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.check_login
        )]

    def check_login(self, response):
        """Log the login API's JSON reply, then start the real crawl."""
        json_text = json.loads(response.text)
        print(json_text)
        for url in self.start_urls:
            # dont_filter: the start URL was already seen during login.
            yield scrapy.Request(url, dont_filter=True, headers=self.headers)
